List of used packages:

library(limma)
library(bapred)
library(Harman)
library(ggplot2)
library(sva)
library(ggbiplot)
library(heatmaply)
library(EnhancedVolcano)
library(ggVennDiagram)
library(mixOmics)
library(dplyr)

1 BMC

Limma package provides BMC (batch mean centering) method for batch correction. BMC method includes centering the variables within batches to have zero mean.

1.1 Batch correction

data_limma <- data_norm_quantile_max
batch <- as.numeric(data_factors$Series)

ex <- cbind(data_limma, batch)

data_limma_corrected <- removeBatchEffect(data_limma, batch)

1.2 PCA

# Year 1 and 2
pca_limma <- prcomp(t(data_limma_corrected), center = T, scale. = F)

ggbiplot(pca_limma, ellipse = TRUE, groups = data_factors$Series, labels = NULL, var.axes = FALSE, alpha = 0.7) +
  labs(title = "After correction (BMC)", color = "Year") +
  scale_color_manual(values = c("#0a9278","#f57002")) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

ggbiplot(pca_limma, ellipse = TRUE, groups = data_factors$Differentiation, labels = NULL, var.axes = FALSE, alpha = 0.7) +
  geom_point(aes(col = data_factors$Differentiation, shape = data_factors$Series), size = 2) +
  labs(title = "After correction (BMC)", color = "Group", shape = "Year") +
  scale_color_manual(values = c("#0a9278","#f57002")) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

1.3 Differential expression

X_year <- model.matrix(~ Series, data = data_factors)

fit_y <- lmFit(data_limma_corrected, design = X_year, method = "robust", maxit = 1000)

efit_y <- eBayes(fit_y)

topTable(efit_y, coef = 2)
##           logFC  AveExpr          t      P.Value    adj.P.Val         B
## PIN1  -2.482245 21.45065 -10.246949 6.804538e-12 8.063395e-09 17.097290
## RLA1  -3.979011 22.12665  -9.980369 1.336105e-11 8.063395e-09 16.411783
## GBG12  1.516407 23.31082   8.452290 7.642019e-10 2.470333e-07 12.508952
## ESYT2  2.753634 20.54996   8.427233 8.186687e-10 2.470333e-07 12.442004
## LPP    5.051752 20.19059   8.198148 1.541825e-09 3.721965e-07 11.828724
## XPO1   1.712880 22.44604   8.008051 2.619824e-09 5.270213e-07 11.306186
## SPRE  -1.201001 21.34965  -7.391854 1.503245e-08 2.592023e-06  9.604051
## RAB21 -1.084980 21.81001  -6.489600 2.079114e-07 3.136863e-05  7.026932
## BAG2  -1.783249 22.38747  -6.375802 2.909325e-07 3.901728e-05  6.687549
## GNAS2 -1.017916 22.30146  -6.227559 4.512263e-07 4.951183e-05  6.277263
num_spots <- nrow(data_norm_quantile_max)
full_list_y <- topTable(efit_y, coef = 2, number = num_spots,
                        sort.by = "none")

Draw heatmap:

p_above_y <- full_list_y$adj.P.Val <= 0.05
dif_data_limma <- data_norm_quantile_max[p_above_y, ]

heatmaply(dif_data_limma, main = "Year 1 and 2", fontsize_row = 1, k_col = NA, dendrogram = "col", scale_fill_gradient_fun = ggplot2::scale_fill_gradient2(low = "lightseagreen", high = "orangered3", midpoint = 15))

Draw Volcano plot:

EnhancedVolcano(full_list_y,
                lab = rownames(full_list_y),
                title = "After correction (BMC)",
                subtitle = NULL,
                x = 'logFC',
                y = 'adj.P.Val',
                pCutoff = 0.05,
                FCcutoff = 0.1,
                col = c("lightcyan4","#f57002", "#ee9f02", "#0a9278"),
                legend = c("Not significant","Log2FC","Padj","Padj & Log2FC"),
                legendPosition = "right")

2 Ratio A

Ratio A method of batch correction is ratio-based method scaling the expression values by the arithmetic mean. “bapred” package provides possibility to correct batch effect by Ratio A method.

2.1 Batch correction:

batch <- data_factors$Series
ratio_A_data <- ratioa(t(data_norm_quantile_max), batch)
ratio_A_corrected <- ratio_A_data$xadj

2.2 PCA

pca_ratio_A <- prcomp(ratio_A_corrected, center = T, scale. = F)

ggbiplot(pca_ratio_A, ellipse = TRUE, groups = data_factors$Series, labels = NULL, var.axes = FALSE, alpha = 0.7) +
  scale_color_manual(values = c("#0a9278","#f57002")) +
  labs(title = "After correction (Ratio A)", color = "Year") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

ggbiplot(pca_ratio_A, ellipse = TRUE, groups = data_factors$Differentiation, labels = NULL, var.axes = FALSE, alpha = 0.7) +
  geom_point(aes(col = data_factors$Differentiation, shape = data_factors$Series), size = 2) +
  scale_color_manual(values = c("#0a9278","#f57002")) +
  labs(title = "After correction (Ratio A)", color = "Group", shape = "Year") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

2.3 Differential expression

X_year <- model.matrix(~ Series, data = data_factors)

fit_y <- lmFit(t(ratio_A_corrected), design = X_year, method = "robust", maxit = 1000)

efit_y <- eBayes(fit_y)

topTable(efit_y, coef = 2)
##             logFC AveExpr          t      P.Value    adj.P.Val         B
## RLA1  -0.23037822       1 -12.546370 3.578461e-14 4.319203e-11 22.305441
## PIN1  -0.12803484       1 -12.185600 7.987779e-14 4.820625e-11 21.495688
## LPP    0.31313652       1  11.473467 4.080263e-13 1.641626e-10 19.888912
## ESYT2  0.15642663       1  10.462180 4.609192e-12 1.390824e-09 17.494910
## GBG12  0.06799390       1   9.176459 1.216781e-10 2.937308e-08 14.155468
## CKAP5  0.20720119       1   9.052403 1.687874e-10 3.395440e-08 13.905262
## XPO1   0.08379064       1   8.524353 6.949554e-10 1.198302e-07 12.413917
## SPRE  -0.05967107       1  -8.320469 1.211643e-09 1.828067e-07 11.879297
## BAG2  -0.08927132       1  -6.945037 5.849848e-08 7.845297e-06  7.970973
## AP1M1  0.14431384       1   6.861860 7.442363e-08 8.982933e-06  7.755355
num_spots <- nrow(data_norm_quantile_max)
full_list_y <- topTable(efit_y, coef = 2, number = num_spots,
                        sort.by = "none")

Draw heatmap:

p_above_y <- full_list_y$adj.P.Val <= 0.05
dif_data_ratio_A_year <- data_norm_quantile_max[p_above_y, ]

heatmaply(dif_data_ratio_A_year, main = "Year 1 and 2", fontsize_row = 1, k_col = NA, dendrogram = "col", scale_fill_gradient_fun = ggplot2::scale_fill_gradient2(low = "lightseagreen", high = "orangered3", midpoint = 15))

Draw Volcano plot:

EnhancedVolcano(full_list_y,
                lab = rownames(full_list_y),
                title = "After correction (Ratio A)",
                subtitle = NULL,
                x = 'logFC',
                y = 'adj.P.Val',
                pCutoff = 0.05,
                FCcutoff = 0.1,
                col = c("lightcyan4","#f57002", "#ee9f02", "#0a9278"),
                legend = c("Not significant","Log2FC","Padj","Padj & Log2FC"),
                legendPosition = "right")

3 Ratio G

Ratio G method of batch correction is ratio-based method scaling the expression values by the geometric mean. Batch effect may be corrected by Ratio G method using “bapred” package. ## Batch correction

ratio_G_data <- ratiog(t(data_norm_quantile_max), batch)
ratio_G_corrected <- ratio_G_data$xadj

3.1 PCA

pca_ratio_G <- prcomp(ratio_G_corrected, center = T, scale. = F)

ggbiplot(pca_ratio_G, ellipse = TRUE, groups = data_factors$Series, labels = NULL, var.axes = FALSE, alpha = 0.7) +
  scale_color_manual(values = c("#0a9278","#f57002")) +
  labs(title = "After correction (Ratio G)", color = "Year") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

ggbiplot(pca_ratio_G, ellipse = TRUE, groups = data_factors$Differentiation, labels = NULL, var.axes = FALSE, alpha = 0.7) +
  geom_point(aes(col = data_factors$Differentiation, shape = data_factors$Series), size = 2) +
  scale_color_manual(values = c("#0a9278","#f57002")) +
  labs(title = "After correction (Ratio G)", color = "Group", shape = "Year") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

3.2 Differential expression

X_year <- model.matrix(~ Series, data = data_factors)

fit_y <- lmFit(t(ratio_G_corrected), design = X_year, method = "robust", maxit = 1000)

efit_y <- eBayes(fit_y)

topTable(efit_y, coef = 2)
##            logFC  AveExpr         t      P.Value    adj.P.Val        B
## GBG12  0.2091473 1.056444  27.80813 1.280375e-24 1.545412e-21 46.15956
## LPP    0.6944541 1.120952  24.23900 1.004305e-22 6.060980e-20 41.81046
## RLA1  -0.5396549 1.144792 -23.53038 2.557164e-22 1.028832e-19 40.89738
## ESYT2  0.3592177 1.073850  23.20649 3.952974e-22 1.192810e-19 40.44376
## PIN1  -0.2165076 1.045114 -20.51109 1.853548e-20 4.474465e-18 36.66458
## CKAP5  0.4232623 1.076112  18.04819 9.300528e-19 1.870956e-16 32.76947
## XPO1   0.1426994 1.023223  14.48104 6.311314e-16 1.088251e-13 26.15817
## CHMP3  0.1837258 1.046782  13.72902 2.907309e-15 4.127768e-13 24.60568
## GNAS2 -0.1065551 1.033405 -13.65076 3.419858e-15 4.127768e-13 24.46003
## GNAS1 -0.1065551 1.033405 -13.65076 3.419858e-15 4.127768e-13 24.46003
num_spots <- nrow(data_norm_quantile_max)
full_list_y <- topTable(efit_y, coef = 2, number = num_spots,
                        sort.by = "none")

Draw heatmap:

p_above_y <- full_list_y$adj.P.Val <= 0.05
dif_data_ratio_G_year <- data_norm_quantile_max[p_above_y, ]

heatmaply(dif_data_ratio_G_year, main = "Year 1 and 2", fontsize_row = 1, k_col = NA, dendrogram = "col", scale_fill_gradient_fun = ggplot2::scale_fill_gradient2(low = "lightseagreen", high = "orangered3", midpoint = 15))

Draw Volcano plot:

EnhancedVolcano(full_list_y,
                lab = rownames(full_list_y),
                title = "After correction (Ratio G)",
                subtitle = NULL,
                x = 'logFC',
                y = 'adj.P.Val',
                pCutoff = 0.05,
                FCcutoff = 0.1,
                col = c("lightcyan4","#f57002", "#ee9f02", "#0a9278"),
                legend = c("Not significant","Log2FC","Padj","Padj & Log2FC"),
                legendPosition = "right")

4 Harman

Harman method is based on PCA. It reduces batch effect and keeps user-defined class effects. Harman batch correction is provided by eponymous package “Harman”.

4.1 Batch correction

expt <- data_factors$Differentiation
batch <- data_factors$Series
data.harman <- harman(data_norm_quantile_max, expt, batch)
data_corrected_harman <- reconstructData(data.harman)

4.2 PCA

pca_harman <- prcomp(t(data_corrected_harman), center = T, scale. = F)

ggbiplot(pca_harman, ellipse = TRUE, groups = data_factors$Series, labels = NULL, var.axes = FALSE, alpha = 0.7) +
  scale_color_manual(values = c("#0a9278","#f57002")) +
  labs(title = "After correction (Harman)", color = "Year") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

ggbiplot(pca_harman, ellipse = TRUE, groups = data_factors$Differentiation, labels = NULL, var.axes = FALSE, alpha = 0.7) +
  geom_point(aes(col = data_factors$Differentiation, shape = data_factors$Series), size = 2) +
  scale_color_manual(values = c("#0a9278","#f57002")) +
  labs(title = "After correction (Harman)", color = "Group", shape = "Year") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

4.3 Differential expression

X_year <- model.matrix(~ Series, data = data_factors)

fit_y <- lmFit(data_corrected_harman, design = X_year, method = "robust", maxit = 1000)

efit_y <- eBayes(fit_y)

topTable(efit_y, coef = 2)
##           logFC  AveExpr         t      P.Value    adj.P.Val         B
## RAB21 -1.341754 21.70762 -8.025443 2.495338e-09 3.011872e-06 11.359588
## TPM2  -1.764867 25.17584 -7.688177 6.454292e-09 3.895165e-06 10.440436
## LPP    4.623572 20.73165  7.503283 1.092626e-08 4.395999e-06  9.930884
## PIN1  -1.717666 21.18474 -7.090695 3.583175e-08 9.234291e-06  8.781506
## RLA1  -2.817956 21.39513 -7.068148 3.825307e-08 9.234291e-06  8.722626
## XPO1   1.459329 22.76890  6.822651 7.819325e-08 1.572987e-05  8.021715
## ESYT2  2.182863 20.94196  6.680443 1.185929e-07 2.044881e-05  7.633963
## GBG12  1.078936 23.45918  6.013871 8.511862e-07 1.284227e-04  5.698372
## RAB34 -1.020447 22.21378 -5.873708 1.292015e-06 1.732736e-04  5.284679
## TBA1C -3.304413 21.39006 -5.711448 2.096051e-06 2.529934e-04  4.828876
num_spots <- nrow(data_norm_quantile_max)
full_list_y <- topTable(efit_y, coef = 2, number = num_spots,
                        sort.by = "none")

Draw heatmap:

p_above_y <- full_list_y$adj.P.Val <= 0.05
dif_data_harman_year <- data_norm_quantile_max[p_above_y, ]

heatmaply(dif_data_harman_year, main = "Year 1 and 2", fontsize_row = 1, k_col = NA, dendrogram = "col", scale_fill_gradient_fun = ggplot2::scale_fill_gradient2(low = "lightseagreen", high = "orangered3", midpoint = 15))

Draw Volcano plot:

EnhancedVolcano(full_list_y,
                lab = rownames(full_list_y),
                title = "After correction (Harman)",
                subtitle = NULL,
                x = 'logFC',
                y = 'adj.P.Val',
                pCutoff = 0.05,
                FCcutoff = 0.1,
                col = c("lightcyan4","#f57002", "#ee9f02", "#0a9278"),
                legend = c("Not significant","Log2FC","Padj","Padj & Log2FC"),
                legendPosition = "right")

5 Combat

5.1 Batch correction

pheno <- data_factors
edata <- data_norm_quantile_max
colnames(pheno)[5] <- "batch"

mod <- model.matrix(~ Differentiation + Health, data = pheno)
mod0 <- model.matrix(~ 1, data = pheno)

n.sv <- num.sv(edata, mod, method = "leek")
n.sv
## [1] 1
svobj <- sva(edata, mod, mod0, n.sv = n.sv)
## Number of significant surrogate variables is:  1 
## Iteration (out of 5 ):1  2  3  4  5
modSv <- cbind(mod, svobj$sv)
mod0Sv <- cbind(mod0, svobj$sv)
pValuesSv <- f.pvalue(edata, modSv, mod0Sv)
qValuesSv <- p.adjust(pValuesSv,method = "BH")

fit <- lmFit(edata,modSv)

contrast.matrix <- cbind("C1"= c(-1, 1, 0, rep(0,svobj$n.sv)),"C2"= c(0, -1, 1, rep(0,svobj$n.sv)))
fitContrasts <- contrasts.fit(fit,contrast.matrix)

eb <- eBayes(fitContrasts)

batch <- pheno$batch

modcombat <- model.matrix(~ 1, data = pheno)
combat_edata <- ComBat(dat = edata, batch = batch, mod = modcombat, par.prior = TRUE, prior.plots = TRUE)
## Standardizing Data across genes

5.2 PCA

pca_sva_combat <- prcomp(t(combat_edata), center = T, scale. = F)

ggbiplot(pca_sva_combat, ellipse = TRUE, groups = data_factors$Series, labels = NULL, var.axes = FALSE, alpha = 0.7) +
  scale_color_manual(values = c("#0a9278","#f57002")) +
  labs(title = "After correction (ComBat)", color = "Year") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

ggbiplot(pca_sva_combat, ellipse = TRUE, groups = data_factors$Differentiation, labels = NULL, var.axes = FALSE, alpha = 0.7) +
  geom_point(aes(col = data_factors$Differentiation, shape = data_factors$Series), size = 2) +
  scale_color_manual(values = c("#0a9278","#f57002")) +
  labs(title = "After correction (ComBat)", color = "Group", shape = "Year") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

5.3 Differential expression

X <- model.matrix(~ Series, data = data_factors)

fit <- lmFit(combat_edata, design = X, method = "robust", maxit = 1000)

efit <- eBayes(fit)

topTable(efit, coef = 2)
##            logFC  AveExpr         t      P.Value  adj.P.Val           B
## ESYT2  1.6235873 20.92591  4.622476 5.357084e-05 0.03452158  1.69659236
## GBG12  0.8940414 23.45268  4.600204 5.720228e-05 0.03452158  1.69093625
## RLA1  -2.3532854 21.21166 -4.234503 1.665726e-04 0.06701770  0.73944472
## LPP    2.5640940 20.70977  3.914258 4.178299e-04 0.12608018 -0.02490026
## E2AK2 -0.8187566 22.56392 -3.649384 8.803275e-04 0.21251105 -0.63692439
## PIN1  -1.5408022 21.12110 -3.491923 1.360056e-03 0.27359792 -1.00017291
## GGYF2  0.9471270 22.10664  3.368565 1.903222e-03 0.28376434 -1.28880942
## SPRE  -0.7000253 21.14478 -3.346589 2.019688e-03 0.28376434 -1.34266618
## XPO1   0.7883921 22.75666  3.329333 2.115890e-03 0.28376434 -1.37208511
## VATH  -0.9287984 23.05284 -3.226370 2.787655e-03 0.33646991 -1.60346169
num_spots <- nrow(combat_edata)
full_list <- topTable(efit, coef = 2, number = num_spots,
                      sort.by = "none")

Draw heatmap:

p_above <- full_list$adj.P.Val <= 0.05
dif_data_combat <- combat_edata[p_above, ]

heatmaply(dif_data_combat, main = "Control vs Healthy", fontsize_row = 1, k_col = 3, dendrogram = "col", scale_fill_gradient_fun = ggplot2::scale_fill_gradient2(low = "lightseagreen", high = "orangered3", midpoint = 15))

Draw Volcano plot:

EnhancedVolcano(full_list,
                lab = rownames(full_list),
                title = "After correction (ComBat)",
                subtitle = NULL,
                x = 'logFC',
                y = 'adj.P.Val',
                pCutoff = 0.05,
                FCcutoff = 0.1,
                col = c("lightcyan4","#f57002", "#ee9f02", "#0a9278"),
                legend = c("Not significant","Log2FC","Padj","Padj & Log2FC"),
                legendPosition = "right")

6 Comparison of batch correction methods

Firstly, we compared the number of remaining proteins whose expression differ depending on year of experiment. Ideally, there must be no such proteins.

x <- list(rownames(dif_data_limma), rownames(combat_edata), rownames(dif_data_ratio_A_year), rownames(dif_data_ratio_G_year), rownames(dif_data_harman_year))

ggVennDiagram(x,category.names = c("BMC","Combat", "Ratio A", "Ratio G", "Harman"), label_alpha = 0) +
  scale_fill_gradient(low = "palegreen3", high = "#0a9278") +
  labs(title = "Differentially expressed proteins after \n batch correction \n (condition = 'Year')") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

It could be seen that ComBat method considerably reduced the number of such proteins (only 2 proteins remained). Interestingly, Ratio G method decreased the number of undesirable proteins, but most of them (59) were unique.

7 PLS-DA

Secondly, we compared PLS-DA results. PLS-DA (Partial least squares-discriminant analysis) is a linear classification model that is able to predict the class of samples. Here we used PLS-DA in order to compare prediction of classes:

  • Control cells obtained in 1 year
  • Control cells obtained in 2 year
  • Differentiated cells obtained in 1 year
  • Differentiated cells obtained in 2 year

Classes of one year should be predicted with similar probability

7.1 Data without correction

data_factors$Series_Differentiation <- factor(apply(data_factors[, c(5, 4)], 1, paste, collapse = "_"))

X <- t(data_norm_quantile_max)
Y <- data_factors$Series_Differentiation 
summary(Y)
##         1_Control 1_Differentiation         2_Control 2_Differentiation 
##                 9                10                 7                 7
list.keepX <- c(5:10,  seq(20, 100, 10))
set.seed(30) 
tune.splsda.srbct <- tune.splsda(X, Y, ncomp = 3,
                                 validation = 'Mfold',
                                 folds = 3, dist = 'max.dist', progressBar = FALSE,
                                 measure = "BER", test.keepX = list.keepX,
                                 nrepeat = 10)   

ncomp <- tune.splsda.srbct$choice.ncomp$ncomp 
ncomp
## [1] 2
select.keepX <- tune.splsda.srbct$choice.keepX[1:ncomp] 
select.keepX
## comp1 comp2 
##     5    20
MyResult.splsda.fixed <- splsda(X, Y,ncomp = ncomp, keepX = select.keepX)

layout(matrix(c(1, 2, 3, 3, 3, 3), 2, 3))
plotLoadings(MyResult.splsda.fixed, comp = 1, size.name = 1, size.title = 1.2, title = "Loadings\n on 1st component", contrib = "max", legend = FALSE, col.ties="black", ndisplay = 10)
plotLoadings(MyResult.splsda.fixed, comp = 2, size.name = 1, size.title = 1.2, title = "Loadings\n on 2nd component", contrib = "max",ndisplay = 10,  legend = FALSE, col.ties="black")
plotIndiv(MyResult.splsda.fixed, ind.names = F, ellipse = T, style = "graphics", abline = TRUE, cex = 2, pch = 19, size.axis = 1.2, size.xlabel = 1.5, size.ylabel = 1.5, title = "sPLS-DA ordination of samples (No correction)", size.title = 1.5)
legend("bottomright", legend = levels(data_factors$Series_Differentiation), cex = 1, fill = color.mixo(1:4), bty = "n")

auc.plsda <- auroc(MyResult.splsda.fixed)

## $Comp1
##                                  AUC   p-value
## 1_Control vs Other(s)         0.6296 2.577e-01
## 1_Differentiation vs Other(s) 0.9565 3.903e-05
## 2_Control vs Other(s)         0.7802 2.471e-02
## 2_Differentiation vs Other(s) 0.9505 3.050e-04
## 
## $Comp2
##                               AUC   p-value
## 1_Control vs Other(s)           1 1.267e-05
## 1_Differentiation vs Other(s)   1 6.640e-06
## 2_Control vs Other(s)           1 6.140e-05
## 2_Differentiation vs Other(s)   1 6.140e-05

7.2 BMC

data_factors$Series_Differentiation <- factor(apply(data_factors[, c(5, 4)], 1, paste, collapse = "_"))

X <- t(data_limma_corrected)
Y <- data_factors$Series_Differentiation
summary(Y)
##         1_Control 1_Differentiation         2_Control 2_Differentiation 
##                 9                10                 7                 7
list.keepX <- c(5:10,  seq(20, 100, 10))
set.seed(30) 
tune.splsda.srbct <- tune.splsda(X, Y, ncomp = 3,
                                 validation = 'Mfold',
                                 folds = 3, dist = 'max.dist', progressBar = FALSE,
                                 measure = "BER", test.keepX = list.keepX,
                                 nrepeat = 10)   

ncomp <- tune.splsda.srbct$choice.ncomp$ncomp 
ncomp
## [1] 2
select.keepX <- tune.splsda.srbct$choice.keepX[1:ncomp] 
select.keepX
## comp1 comp2 
##    30    90
MyResult.splsda.fixed <- splsda(X, Y,ncomp = ncomp, keepX = select.keepX)

layout(matrix(c(1, 2, 3, 3, 3, 3), 2, 3))
plotLoadings(MyResult.splsda.fixed, comp = 1, size.name = 1, size.title = 1.2, title = "Loadings\n on 1st component", contrib = "max", legend = FALSE, col.ties="black", ndisplay = 10)
plotLoadings(MyResult.splsda.fixed, comp = 2, size.name = 1, size.title = 1.2, title = "Loadings\n on 2nd component", contrib = "max",ndisplay = 10,  legend = FALSE, col.ties="black")
plotIndiv(MyResult.splsda.fixed, ind.names = F, ellipse = T, style = "graphics", abline = TRUE, cex = 2, pch = 19, size.axis = 1.2, size.xlabel = 1.5, size.ylabel = 1.5, title = "sPLS-DA ordination of samples (BMC)", size.title = 1.5)
legend(x = "topleft", legend = levels(data_factors$Series_Differentiation), cex = 1, fill = color.mixo(1:4), bty = "n")

auc.plsda <- auroc(MyResult.splsda.fixed)

## $Comp1
##                                  AUC   p-value
## 1_Control vs Other(s)         0.9907 1.829e-05
## 1_Differentiation vs Other(s) 0.9304 1.053e-04
## 2_Control vs Other(s)         0.6648 1.865e-01
## 2_Differentiation vs Other(s) 0.7033 1.032e-01
## 
## $Comp2
##                                  AUC   p-value
## 1_Control vs Other(s)         1.0000 1.267e-05
## 1_Differentiation vs Other(s) 0.9826 1.372e-05
## 2_Control vs Other(s)         0.9890 8.881e-05
## 2_Differentiation vs Other(s) 1.0000 6.140e-05

7.3 Ratio A

data_factors$Series_Differentiation <- factor(apply(data_factors[, c(5, 4)], 1, paste, collapse = "_"))

X <- ratio_A_corrected
Y <- data_factors$Series_Differentiation
summary(Y)
##         1_Control 1_Differentiation         2_Control 2_Differentiation 
##                 9                10                 7                 7
list.keepX <- c(5:10,  seq(20, 100, 10))
set.seed(30) 
tune.splsda.srbct <- tune.splsda(X, Y, ncomp = 3,
                                 validation = 'Mfold',
                                 folds = 3, dist = 'max.dist', 
                                 progressBar = FALSE,
                                 measure = "BER",
                                 test.keepX = list.keepX,
                                 nrepeat = 10)   

ncomp <- tune.splsda.srbct$choice.ncomp$ncomp 
ncomp
## [1] 2
select.keepX <- tune.splsda.srbct$choice.keepX[1:ncomp] 
select.keepX
## comp1 comp2 
##    30    70
MyResult.splsda.fixed <- splsda(X, Y,ncomp = ncomp, keepX = select.keepX)

layout(matrix(c(1, 2, 3, 3, 3, 3), 2, 3))
plotLoadings(MyResult.splsda.fixed, comp = 1, size.name = 1, size.title = 1.2, title = "Loadings\n on 1st component", contrib = "max", legend = FALSE, col.ties="black", ndisplay = 10)
plotLoadings(MyResult.splsda.fixed, comp = 2, size.name = 1, size.title = 1.2, title = "Loadings\n on 2nd component", contrib = "max",ndisplay = 10,  legend = FALSE, col.ties="black")
plotIndiv(MyResult.splsda.fixed, ind.names = F, ellipse = T, style = "graphics", abline = TRUE, cex = 2, pch = 19, size.axis = 1.2, size.xlabel = 1.5, size.ylabel = 1.5, title = "sPLS-DA ordination of samples (Ratio A)", size.title = 1.5)
legend(x = "topleft", legend = levels(data_factors$Series_Differentiation), cex = 1, fill = color.mixo(1:4), bty = "n")

auc.plsda <- auroc(MyResult.splsda.fixed)

## $Comp1
##                                  AUC   p-value
## 1_Control vs Other(s)         0.9954 1.524e-05
## 1_Differentiation vs Other(s) 0.9348 8.954e-05
## 2_Control vs Other(s)         0.6593 2.016e-01
## 2_Differentiation vs Other(s) 0.6978 1.129e-01
## 
## $Comp2
##                                  AUC   p-value
## 1_Control vs Other(s)         1.0000 1.267e-05
## 1_Differentiation vs Other(s) 0.9870 1.147e-05
## 2_Control vs Other(s)         0.9945 7.392e-05
## 2_Differentiation vs Other(s) 1.0000 6.140e-05

7.4 Ratio G

data_factors$Series_Differentiation <- factor(apply(data_factors[, c(5, 4)], 1, paste, collapse = "_"))

X <- ratio_G_corrected
Y <- data_factors$Series_Differentiation

list.keepX <- c(5:10,  seq(20, 100, 10))
set.seed(30) 
tune.splsda.srbct <- tune.splsda(X, Y, ncomp = 3,
                                 validation = 'Mfold',
                                 folds = 3, dist = 'max.dist', 
                                 progressBar = FALSE,
                                 measure = "BER",
                                 test.keepX = list.keepX,
                                 nrepeat = 10)   

ncomp <- tune.splsda.srbct$choice.ncomp$ncomp 
ncomp
## [1] 2
select.keepX <- tune.splsda.srbct$choice.keepX[1:ncomp] 
select.keepX
## comp1 comp2 
##    30    70
MyResult.splsda.fixed <- splsda(X, Y,ncomp = ncomp, keepX = select.keepX)

layout(matrix(c(1, 2, 3, 3, 3, 3), 2, 3))
plotLoadings(MyResult.splsda.fixed, comp = 1, size.name = 1, size.title = 1.2, title = "Loadings\n on 1st component", contrib = "max", legend = FALSE, col.ties="black", ndisplay = 10)
plotLoadings(MyResult.splsda.fixed, comp = 2, size.name = 1, size.title = 1.2, title = "Loadings\n on 2nd component", contrib = "max",ndisplay = 10,  legend = FALSE, col.ties="black")
plotIndiv(MyResult.splsda.fixed, ind.names = F, ellipse = T, style = "graphics", abline = TRUE, cex = 2, pch = 19, size.axis = 1.2, size.xlabel = 1.5, size.ylabel = 1.5, title = "sPLS-DA ordination of samples (Ratio G)", size.title = 1.5)
legend(x = "topleft", legend = levels(data_factors$Series_Differentiation), cex = 1, fill = color.mixo(1:4), bty = "n")

auc.plsda <- auroc(MyResult.splsda.fixed)

## $Comp1
##                                  AUC   p-value
## 1_Control vs Other(s)         0.9954 1.524e-05
## 1_Differentiation vs Other(s) 0.9348 8.954e-05
## 2_Control vs Other(s)         0.6593 2.016e-01
## 2_Differentiation vs Other(s) 0.6978 1.129e-01
## 
## $Comp2
##                                  AUC   p-value
## 1_Control vs Other(s)         1.0000 1.267e-05
## 1_Differentiation vs Other(s) 0.9870 1.147e-05
## 2_Control vs Other(s)         0.9945 7.392e-05
## 2_Differentiation vs Other(s) 1.0000 6.140e-05

7.5 Harman

data_factors$Series_Differentiation <- factor(apply(data_factors[, c(5, 4)], 1, paste, collapse = "_"))

X <- t(data_corrected_harman)
Y <- data_factors$Series_Differentiation

list.keepX <- c(5:10,  seq(20, 100, 10))
set.seed(30) 
tune.splsda.srbct <- tune.splsda(X, Y, ncomp = 3,
                                 validation = 'Mfold',
                                 folds = 3, dist = 'max.dist', progressBar = FALSE,
                                 measure = "BER", test.keepX = list.keepX,
                                 nrepeat = 10)   

ncomp <- tune.splsda.srbct$choice.ncomp$ncomp 
ncomp
## [1] 2
select.keepX <- tune.splsda.srbct$choice.keepX[1:ncomp] 
select.keepX
## comp1 comp2 
##    30    80
MyResult.splsda.fixed <- splsda(X, Y,ncomp = ncomp, keepX = select.keepX)

layout(matrix(c(1, 2, 3, 3, 3, 3), 2, 3))
plotLoadings(MyResult.splsda.fixed, comp = 1, size.name = 1, size.title = 1.2, title = "Loadings\n on 1st component", contrib = "max", legend = FALSE, col.ties="black", ndisplay = 10)
plotLoadings(MyResult.splsda.fixed, comp = 2, size.name = 1, size.title = 1.2, title = "Loadings\n on 2nd component", contrib = "max",ndisplay = 10,  legend = FALSE, col.ties="black")
plotIndiv(MyResult.splsda.fixed, ind.names = F, ellipse = T, style = "graphics", abline = TRUE, cex = 2, pch = 19, size.axis = 1.2, size.xlabel = 1.5, size.ylabel = 1.5, title = "sPLS-DA ordination of samples (Harman)", size.title = 1.5)
legend(x = "topleft", legend = levels(data_factors$Series_Differentiation), cex = 1, fill = color.mixo(1:4), bty = "n")

auc.plsda <- auroc(MyResult.splsda.fixed)

## $Comp1
##                                  AUC   p-value
## 1_Control vs Other(s)         1.0000 1.267e-05
## 1_Differentiation vs Other(s) 0.9043 2.694e-04
## 2_Control vs Other(s)         0.6538 2.176e-01
## 2_Differentiation vs Other(s) 0.7363 5.828e-02
## 
## $Comp2
##                                  AUC   p-value
## 1_Control vs Other(s)         1.0000 1.267e-05
## 1_Differentiation vs Other(s) 0.9696 2.330e-05
## 2_Control vs Other(s)         0.9890 8.881e-05
## 2_Differentiation vs Other(s) 1.0000 6.140e-05

7.6 Combat

data_factors$Series_Differentiation <- factor(apply(data_factors[, c(5, 4)], 1, paste, collapse = "_"))

X <- t(combat_edata)
Y <- data_factors$Series_Differentiation
summary(Y)
##         1_Control 1_Differentiation         2_Control 2_Differentiation 
##                 9                10                 7                 7
list.keepX <- c(5:10,  seq(20, 100, 10))
set.seed(30) 
tune.splsda.srbct <- tune.splsda(X, Y, ncomp = 3,
                                 validation = 'Mfold',
                                 folds = 3, dist = 'max.dist', 
                                 progressBar = FALSE,
                                 measure = "BER",
                                 test.keepX = list.keepX,
                                 nrepeat = 10)   

ncomp <- tune.splsda.srbct$choice.ncomp$ncomp 
ncomp
## [1] 2
select.keepX <- tune.splsda.srbct$choice.keepX[1:ncomp] 
select.keepX
## comp1 comp2 
##    50    10
MyResult.splsda.fixed <- splsda(X, Y,ncomp = ncomp, keepX = select.keepX)

layout(matrix(c(1, 2, 3, 3, 3, 3), 2, 3))
plotLoadings(MyResult.splsda.fixed, comp = 1, size.name = 1, size.title = 1.2, title = "Loadings\n on 1st component", contrib = "max", legend = FALSE, col.ties="black", ndisplay = 10)
plotLoadings(MyResult.splsda.fixed, comp = 2, size.name = 1, size.title = 1.2, title = "Loadings\n on 2nd component", contrib = "max",ndisplay = 10,  legend = FALSE, col.ties="black")
plotIndiv(MyResult.splsda.fixed, ind.names = F, ellipse = T, style = "graphics", abline = TRUE, cex = 2, pch = 19, size.axis = 1.2, size.xlabel = 1.5, size.ylabel = 1.5, title = "sPLS-DA ordination of samples (ComBat)", size.title = 1.5)
legend(x = "topleft", legend = levels(data_factors$Series_Differentiation), cex = 1, fill = color.mixo(1:4), bty = "n")

auc.plsda <- auroc(MyResult.splsda.fixed)

## $Comp1
##                                  AUC  p-value
## 1_Control vs Other(s)         0.8565 0.001855
## 1_Differentiation vs Other(s) 0.8391 0.002247
## 2_Control vs Other(s)         0.8242 0.009372
## 2_Differentiation vs Other(s) 0.8187 0.010650
## 
## $Comp2
##                                  AUC   p-value
## 1_Control vs Other(s)         0.9954 1.524e-05
## 1_Differentiation vs Other(s) 0.9652 2.771e-05
## 2_Control vs Other(s)         0.9835 1.065e-04
## 2_Differentiation vs Other(s) 1.0000 6.140e-05

Only application of ComBat resulted in similar prediction probability for classes of first and second years. Usage of other methods led to higher prediction probability for Classes of first year than for second year.

8 gPCA

gPCA (guided PCA) is used to test whether a batch effect exists. “Delta” is a metric that varies between 0 and 1. 1 means that there are batch effect, 0 - no batch. p-value indicates the significance of delta value.

real <- gPCA.batchdetect(t(data_norm_quantile_max), data_factors$Series)
sva <- gPCA.batchdetect(t(combat_edata), data_factors$Series, nperm = 10000)
limma <- gPCA.batchdetect(t(data_limma_corrected), data_factors$Series)
ratio_A <- gPCA.batchdetect(ratio_A_corrected, data_factors$Series)
ratio_G <- gPCA.batchdetect(ratio_G_corrected, data_factors$Series)
harman <- gPCA.batchdetect(t(data_corrected_harman), data_factors$Series)

gPCA_table <- data.frame("Correction method" = c("No correction",
                                                 "ComBat",
                                                 "BMC", 
                                                 "Ratio A", 
                                                 "Ratio G", 
                                                 "Harman"),
                         "gPCA delta" = c(real$delta,
                                          sva$delta,
                                          limma$delta,
                                          ratio_A$delta,
                                          ratio_G$delta,
                                          harman$delta), 
                         "P-value" = c(real$p.val,
                                       sva$p.val,
                                       limma$p.val,
                                       ratio_A$p.val,
                                       ratio_G$p.val,
                                       harman$p.val))
gPCA_table
##   Correction.method  gPCA.delta P.value
## 1     No correction 0.991884924  <0.001
## 2            ComBat 0.073928447       1
## 3               BMC 0.004016163       1
## 4           Ratio A 0.009140765       1
## 5           Ratio G 0.383233788   0.786
## 6            Harman 0.315603754   0.908

9 Conclusion

Thus, ComBat method was used in further differential expression and GO enrichment analyses.